import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# define GPU for BERT model
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
import pandas as pd
from matplotlib import pyplot as plt
import wordcloud
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import nltk
import spacy
from gensim import corpora
from gensim.models import LsiModel, LdaModel
from gensim.models.callbacks import PerplexityMetric
from tqdm.notebook import tqdm
import pyLDAvis.gensim
import gensim.downloader as api
from gensim.models import Word2Vec
import gensim
import numpy as np
from sklearn.cluster import KMeans
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from bokeh.io import output_notebook
import bokeh.models as bm, bokeh.plotting as pl
from nltk.tokenize import WordPunctTokenizer
from multiprocessing import Pool
from bs4 import BeautifulSoup
import re
from collections import defaultdict
import umap
import hdbscan
from sklearn.cluster import DBSCAN
# from spacy.cli.download import download
# download(model="en_core_web_sm")
# nltk.download('stopwords')
df = pd.read_csv('emails.csv')
df.head(3)
| file | message | |
|---|---|---|
| 0 | allen-p/_sent_mail/1. | Message-ID: <18782981.1075855378110.JavaMail.e... |
| 1 | allen-p/_sent_mail/10. | Message-ID: <15464986.1075855378456.JavaMail.e... |
| 2 | allen-p/_sent_mail/100. | Message-ID: <24216240.1075855687451.JavaMail.e... |
msg = df['message'][0]
print(msg)
Message-ID: <18782981.1075855378110.JavaMail.evans@thyme> Date: Mon, 14 May 2001 16:39:00 -0700 (PDT) From: phillip.allen@enron.com To: tim.belden@enron.com Subject: Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Transfer-Encoding: 7bit X-From: Phillip K Allen X-To: Tim Belden <Tim Belden/Enron@EnronXGate> X-cc: X-bcc: X-Folder: \Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Sent Mail X-Origin: Allen-P X-FileName: pallen (Non-Privileged).pst Here is our forecast
import email
'Email object keys: %s' % email.message_from_string(msg).keys()
"Email object keys: ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding', 'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName']"
def get_text_from_email(msg_text):
    """Extract and concatenate every text/plain payload from a raw email string."""
    parsed = email.message_from_string(msg_text)
    plain_parts = []
    # walk() visits the message and all nested MIME parts.
    for part in parsed.walk():
        if part.get_content_type() == 'text/plain':
            plain_parts.append(part.get_payload())
    return ''.join(plain_parts)
get_text_from_email(msg)
'Here is our forecast\n\n '
# Extract the plain-text body of every message into a 'Content' column.
df['Content'] = list(map(get_text_from_email, df['message']))
# Parse each raw message once so headers can be read below.
messages = list(map(email.message_from_string, df['message']))
# Copy the selected headers into their own DataFrame columns.
for key in ['Date', 'From', 'To', 'Subject']:
    df[key] = [doc[key] for doc in messages]
def split_email_addresses(line):
    """Split a comma-separated address header into a frozenset of addresses.

    Returns None when the header is missing or empty.
    """
    if not line:
        return None
    return frozenset(part.strip() for part in line.split(','))
def convert_date(data):
    """Parse a tz-aware date string and render it as a naive-UTC formatted string.

    Format reference:
    https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior
    """
    stamp = pd.to_datetime(data)
    # tz_convert(None) shifts the timestamp to UTC and drops the tz info.
    return stamp.tz_convert(None).strftime('%a %d %b %Y %H:%M:%S')
# The mailbox owner's username is the first path component of 'file'.
df['User'] = df['file'].map(lambda x: x.split('/')[0])
# Raw columns are no longer needed once parsed.
df.drop(['file', 'message'], axis=1, inplace=True)
df['From'] = df['From'].map(split_email_addresses)
df['To'] = df['To'].map(split_email_addresses)
# Normalize all dates to naive UTC strings, then parse them back to datetimes.
df['Date'] = df['Date'].apply(convert_date)
# NOTE(review): infer_datetime_format is deprecated/removed in pandas >= 2.0 —
# confirm the pandas version this notebook targets.
df['Date'] = pd.to_datetime(df['Date'], infer_datetime_format=True)
df.sample(3)
| Content | Date | From | To | Subject | User | |
|---|---|---|---|---|---|---|
| 472472 | ---------------------- Forwarded by Mark Taylo... | 2000-01-17 15:27:00 | (mark.taylor@enron.com) | (mark.haedicke@enron.com) | Credit Trading OnLine - Results of Legal Due D... | taylor-m |
| 46559 | MY RECORD SHOULD BE 8 AND 2 | 2000-11-07 08:02:00 | (mike.carson@enron.com) | (rgibbs@btubrokers.com) | Re: Week 10 Results | carson-m |
| 131158 | I'm going through the Jan transport invoices. ... | 2000-04-05 12:05:00 | (chris.germany@enron.com) | (brenda.fletcher@enron.com, sherry.anastas@enr... | Jan Transport | germany-c |
df['Content'].iloc[48]
'---------------------- Forwarded by Phillip K Allen/HOU/ECT on 09/12/2000 \n11:22 AM ---------------------------\n\nMichael Etringer\n\n09/11/2000 02:32 PM\n\nTo: Phillip K Allen/HOU/ECT@ECT\ncc: \nSubject: Contact list for mid market\n\nPhillip,\nAttached is the list. Have your people fill in the columns highlighted in \nyellow. As best can we will try not to overlap on accounts. \n\nThanks, Mike\n\n\n'
def good_word(word):
    """Return True when *word* contains no ASCII digit (vacuously True for '')."""
    return not any('0' <= ch <= '9' for ch in word)
def clean_text(text):
x = text
text = nltk.RegexpTokenizer('\s+', gaps=True).tokenize(text.lower())
text = ' '.join([word for word in text if good_word(word)])
text = re.sub('<.*>', '', text) # text inside <>, some forwarded info
text = re.sub('\[.*\]', '', text) # text inside [], this is some attachements
text = re.sub('\S*@\S*\s?', '', text) # remote email addresses
text = re.sub('\S*/hou/\S*s?', '', text) # some corporation usernames
text = re.sub('http[s]?://\S+', '', text) # remove links
text = re.sub('www\.\S+', '', text) # remove links
# text = re.sub('.*subject:', '', text) # remain only text subject [works too long, let's implement]
msg_text = 'subject:'
idx = text.find(msg_text)
if idx != -1:
text = text[idx + len(msg_text):].strip()
return text
for field in ['Content', 'Subject']:
df[field] = [clean_text(email) for email in df[field]]
<>:10: DeprecationWarning: invalid escape sequence \s
<>:14: DeprecationWarning: invalid escape sequence \[
<>:15: DeprecationWarning: invalid escape sequence \S
<>:16: DeprecationWarning: invalid escape sequence \S
<>:17: DeprecationWarning: invalid escape sequence \S
<>:18: DeprecationWarning: invalid escape sequence \.
<>:10: DeprecationWarning: invalid escape sequence \s
<>:14: DeprecationWarning: invalid escape sequence \[
<>:15: DeprecationWarning: invalid escape sequence \S
<>:16: DeprecationWarning: invalid escape sequence \S
<>:17: DeprecationWarning: invalid escape sequence \S
<>:18: DeprecationWarning: invalid escape sequence \.
<ipython-input-16-18790927da10>:10: DeprecationWarning: invalid escape sequence \s
text = nltk.RegexpTokenizer('\s+', gaps=True).tokenize(text.lower())
<ipython-input-16-18790927da10>:14: DeprecationWarning: invalid escape sequence \[
text = re.sub('\[.*\]', '', text) # text inside [], this is some attachements
<ipython-input-16-18790927da10>:15: DeprecationWarning: invalid escape sequence \S
text = re.sub('\S*@\S*\s?', '', text) # remote email addresses
<ipython-input-16-18790927da10>:16: DeprecationWarning: invalid escape sequence \S
text = re.sub('\S*/hou/\S*s?', '', text) # some corporation usernames
<ipython-input-16-18790927da10>:17: DeprecationWarning: invalid escape sequence \S
text = re.sub('http[s]?://\S+', '', text) # remove links
<ipython-input-16-18790927da10>:18: DeprecationWarning: invalid escape sequence \.
text = re.sub('www\.\S+', '', text) # remove links
df['Content'].iloc[48]
'contact list for mid market phillip, attached is the list. have your people fill in the columns highlighted in yellow. as best can we will try not to overlap on accounts. thanks, mike'
# Word counts per email, capped at 500 so the histogram tail stays readable.
sentence_lengths = [min(500, len(s.split())) for s in df['Content']]
# Count how many emails hit the cap (i.e. have >= 500 words).
cnt = 0
for x in sentence_lengths:
    if x == 500:
        cnt += 1
print('There are %s sentences with length >= 500' % cnt)
plt.figure(figsize=(11, 5))
plt.hist(sentence_lengths, bins=30, color='green')
plt.xlabel('Количество слов в письмах', fontsize=18)
plt.ylabel('Количество писем', fontsize=18)
plt.xlim(0, 400)
plt.show()
There are 34516 sentences with length >= 500
ax = df.groupby(df['Date'].dt.year)['Content'].count().plot(xlim=(1995, 2005), grid=True)
ax.set_xlabel('Year', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
Text(0, 0.5, 'N emails')
ax = df.groupby(df['Date'].dt.dayofweek)['Content'].count().plot(grid=True)
ax.set_xlabel('Day of week', fontsize=15)
ax.set_ylabel('N emails', fontsize=15)
Text(0, 0.5, 'N emails')
ax = df.groupby(df['Date'].dt.hour)['Content'].count().plot(grid=True)
ax.set_xlabel('Hour', fontsize=18)
ax.set_ylabel('N emails', fontsize=18)
Text(0, 0.5, 'N emails')
def plot_wordcloud(text):
    """Render a 200-word word cloud of *text* on a white background."""
    figure, axis = plt.subplots(figsize=(16, 10))
    cloud = wordcloud.WordCloud(
        width=1200,
        height=750,
        max_words=200,
        stopwords=list(ENGLISH_STOP_WORDS),
        background_color='white',
    ).generate(text)
    axis.imshow(cloud)
    axis.axis("off")
subjects = ' '.join(df['Subject'])
plot_wordcloud(subjects)
contents = ' '.join(df.sample(5000)['Content'])
plot_wordcloud(contents)
# Count outgoing emails per sender; each 'From' is a frozenset with one element.
cnt = defaultdict(int)
for item in df['From']:
    sender = list(item)[0]
    # sender = sender[:sender.rfind('.')]
    cnt[sender] += 1
# Top-20 senders, most active first.
users, counts = zip(*sorted(cnt.items(), key=lambda x: x[1], reverse=True)[:20])
plt.figure(figsize=(11, 5))
plt.barh(range(20), counts, align='center', color='green', alpha=0.8)
plt.yticks(range(20), users, fontsize=15)
plt.show()
# Count incoming emails per receiver; 'To' may be None (no receivers).
bad_cnt = 0
# NOTE(review): `cnt` is NOT reset here, so receiver counts accumulate on top of
# the sender counts computed above; the later top-20 chart mixes both — confirm
# this is intentional.
for item in df['To']:
    if item is None:
        bad_cnt += 1
        continue
    for receiver in item:
        # receiver = x[:x.rfind('.')]
        cnt[receiver] += 1
print('There is %s emails without receivers' % bad_cnt)
There is 21847 emails without receivers
users, counts = zip(*sorted(cnt.items(), key=lambda x: x[1], reverse=True)[:20])
plt.figure(figsize=(11, 5))
plt.barh(range(20), counts, align='center', color='green', alpha=0.8)
plt.yticks(range(20), users, fontsize=15)
plt.show()
sub_df = df[['From', 'To', 'Date']].dropna()
print(sub_df.shape)
# drop emails sending to multiple addresses
sub_df = sub_df.loc[sub_df['To'].map(len) == 1]
print(sub_df.shape)
(495554, 3) (354600, 3)
sub_df = sub_df.groupby(['From', 'To']).count().reset_index()
# unpack frozensets
sub_df['From'] = sub_df['From'].map(lambda x: next(iter(x)))
sub_df['To'] = sub_df['To'].map(lambda x: next(iter(x)))
# rename column
sub_df.rename(columns={'Date': 'count'}, inplace=True)
sub_df.sort_values('count', ascending=False).head(10)
| From | To | count | |
|---|---|---|---|
| 17908 | pete.davis@enron.com | pete.davis@enron.com | 9141 |
| 38033 | vince.kaminski@enron.com | vkaminski@aol.com | 4308 |
| 28920 | enron.announcements@enron.com | all.worldwide@enron.com | 2206 |
| 28935 | enron.announcements@enron.com | all.houston@enron.com | 1701 |
| 26510 | kay.mann@enron.com | suzanne.adams@enron.com | 1528 |
| 38031 | vince.kaminski@enron.com | shirley.crenshaw@enron.com | 1190 |
| 14564 | steven.kean@enron.com | maureen.mcvicker@enron.com | 1014 |
| 26309 | kay.mann@enron.com | nmann@erac.com | 980 |
| 18926 | kate.symes@enron.com | evelyn.metoyer@enron.com | 915 |
| 18930 | kate.symes@enron.com | kerri.thompson@enron.com | 859 |
import networkx as nx
G = nx.from_pandas_edgelist(sub_df, 'From', 'To', edge_attr='count', create_using=nx.DiGraph())
print('Number of nodes: %s, Number of edges: %s' % (G.number_of_nodes(), G.number_of_edges()))
Number of nodes: 25396, Number of edges: 56144
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(11, 5))
ax1.hist(list(dict(G.in_degree(weight='count')).values()), log=True, bins=20, color='green')
ax1.set_xlabel('Степень входа вершин', fontsize=18)
ax2.hist(list(dict(G.out_degree(weight='count')).values()), log=True, bins=20, color='green')
ax2.set_xlabel('Степень выхода вершин', fontsize=18)
Text(0.5, 0, 'Степень выхода вершин')
# Sizes of connected components of the (undirected) email graph.
comp_sizes = []
for nodes in nx.connected_components(G.to_undirected()):
    comp_sizes.append(len(nodes))
print('There are %s connected components' % len(comp_sizes))
comp_sizes = np.array(comp_sizes)
# Frequency of each component size.
cnt = defaultdict(int)
for x in comp_sizes:
    cnt[x] += 1
keys = list(cnt.keys())
values = list(cnt.values())
# Sort sizes by how often they occur, most frequent first.
order = np.argsort(values)[::-1]
keys = np.array(keys)[order]
values = np.array(values)[order]
plt.figure(figsize=(11, 5))
plt.barh(range(len(keys)), values, color='green', log=True, )
plt.yticks(range(len(keys)), keys, fontsize=15)
plt.ylabel('Размер компоненты связности', fontsize=18)
plt.xlabel('Количество', fontsize=18)
plt.show()
There are 1430 connected components
df = df.head(50000)
def normalize(word):
    """Return the spaCy lemma of a single word (uses the module-level `nlp`)."""
    token = nlp(word)[0]
    return token.lemma_
nlp = spacy.load("en_core_web_sm")
normalize('dogs'), normalize('playing')
('dog', 'play')
emails = list(df['Content'])
emails = [nltk.RegexpTokenizer(r'\w+').tokenize(email.lower()) for email in emails]
def only_eng_chars(word):
    """True iff every character is a lowercase ASCII letter (vacuously True for '')."""
    return all('a' <= ch <= 'z' for ch in word)
def f(email_words):
    # Lemmatize the tokens of one email, dropping stopwords, short (<= 2 chars)
    # tokens and tokens with non a-z characters.  Relies on the module-level
    # `normalize`, `stop_words` and `only_eng_chars`; run via multiprocessing.Pool.
    return [normalize(word) for word in email_words if word not in stop_words and len(word) > 2 and only_eng_chars(word)]
stop_words = set(nltk.corpus.stopwords.words('english'))
n_cpu = 70
with Pool(n_cpu) as p:
emails = p.map(f, emails)
emails = [email_words for email_words in emails if len(email_words) > 2]
print(f'Total emails extracted: {len(emails)}')
Total emails extracted: 47389
dictionary = corpora.Dictionary(emails)
corpus = [dictionary.doc2bow(email_words) for email_words in emails]
def f(num_topics):
    # Train an LDA model with *num_topics* topics on the module-level `corpus` /
    # `dictionary` and return the perplexity recorded by the callback.
    # NOTE(review): assumes PerplexityMetric exposes get_value() after training —
    # confirm against the installed gensim version.
    perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics, callbacks=[perplexity_logger],
                   iterations=200, passes=10)
    return perplexity_logger.get_value()
rng = range(3, 10)
n_cpu = len(rng)
with Pool(n_cpu) as p:
pps = p.map(f, rng)
plt.figure(figsize=(9, 6))
plt.plot(rng, pps)
plt.xlabel('Num topics')
plt.ylabel('Perplexity')
plt.title('Perplexity dependent by number of topics')
plt.grid()
plt.show()
def show(model):
    """Print each topic of a trained gensim model as a comma-separated word list."""
    for _, topic_repr in model.print_topics():
        # Each topic string looks like '0.015*"word" + 0.013*"other" + ...';
        # keep just the bare words between the quotes.
        terms = []
        for chunk in topic_repr.split("+"):
            quoted = chunk.split("*")[1].strip()
            terms.append(quoted[1:-1])
        print(', '.join(terms))
best_num_topics = rng[np.argmin(pps)]
lda = LdaModel(corpus, id2word=dictionary, num_topics=best_num_topics, iterations=200, passes=10)
show(lda)
page, com, mail, please, new, click, order, free, receive, email please, enron, meet, thank, time, schedule, contact, attach, information, report get, know, send, subject, would, thank, week, message, let, think enron, company, market, page, would, state, issue, say, year, price
data_lda = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.enable_notebook()
pyLDAvis.display(data_lda)
path = api.load("glove-twitter-50", return_path=True)
model = KeyedVectors.load_word2vec_format(path)
model_2 = Word2Vec(size=50, min_count=1)
model_2.build_vocab(emails)
total_examples = model_2.corpus_count
model_2.build_vocab([list(model.vocab.keys())], update=True)
model_2.intersect_word2vec_format(path)
model_2.train(emails, total_examples=total_examples, epochs=5)
(22711105, 22718550)
words = np.array(list(set((" ".join([" ".join(x) for x in emails])).split())))
word_vectors = []
bad_count = 0
for x in words:
try:
vec = model_2.wv.get_vector(x)
except KeyError:
bad_count += 1
vec = np.zeros(50)
word_vectors.append(vec)
print('There are %s words from %s without embeddings' % (bad_count, len(words)))
There are 0 words from 49183 without embeddings
word_vectors = umap.UMAP(n_components=5).fit_transform(word_vectors)
word2vec_map = dict()
for i, word in enumerate(words):
word2vec_map[word] = word_vectors[i]
# Average the 5-D UMAP word vectors of each email to get one vector per email.
# NOTE(review): the loop variable shadows the imported `email` module.
sentence_vectors = []
for email in emails:
    vec = np.zeros(5)
    for word in email:
        # no exception, all words have embeddings
        vec += word2vec_map[word]
    vec /= len(email)
    sentence_vectors.append(vec)
hdbscan_ = hdbscan.HDBSCAN(min_cluster_size=7, core_dist_n_jobs=70)
hdbscan_.fit(sentence_vectors)
labels = sorted(set(hdbscan_.labels_))
print('There are %s clusters' % len(labels))
There are 1335 clusters
cluster_vectors = []
sentence_vectors = np.array(sentence_vectors)
# first cluster in outliers
for label in labels[1:]:
vec = np.mean(sentence_vectors[hdbscan_.labels_ == label], axis=0)
cluster_vectors.append(vec)
cluster_vectors = np.array(cluster_vectors)
cluster_vectors.shape, word_vectors.shape
((1334, 5), (49183, 5))
def show_cluster(num, cluster_size=10):
    """Print the *cluster_size* words nearest (squared L2) to cluster centre *num*.

    Uses the module-level `word_vectors`, `cluster_vectors` and `words` arrays.
    """
    deltas = word_vectors - cluster_vectors[num]
    squared_dists = (deltas ** 2).sum(axis=1)
    nearest = np.argsort(squared_dists)[:cluster_size]
    print(words[nearest])
# 3, 54, 157, 108, 110, 194
show_cluster(110)
['text' 'follow' 'txt' 'plz' 'reply' 'pls' 'please' 'repeat' 'anniversary' 'surprise']
show_cluster(54)
['angel' 'queen' 'princess' 'prince' 'savage' 'poetic' 'explicit' 'remix' 'prod' 'barbie']
show_cluster(3)
['hawaii' 'disneyland' 'jamaica' 'alaska' 'beach' 'resort' 'skyline' 'harbor' 'atlantic' 'caribbean']
show_cluster(194)
['satisify' 'vain' 'endless' 'definition' 'quite' 'priceless' 'compete' 'successfully' 'separate' 'genius']
show_cluster(157)
['heterosexual' 'irresistible' 'homosexual' 'visible' 'invisible' 'singular' 'peculiar' 'plural' 'neutral' 'favorable']
show_cluster(108)
['teen' 'interracial' 'wannabe' 'lesbian' 'amateur' 'porn' 'ebony' 'sex' 'anal' 'cougar']
word_vectors_umap = umap.UMAP(n_components=2).fit_transform(word_vectors)
# word_vectors_pca = TSNE(n_components=2, n_jobs=40).fit_transform(word_vectors)
# word_vectors_pca = PCA(n_components=2).fit_transform(word_vectors)
word_vectors_umap = StandardScaler().fit_transform(word_vectors_umap)
output_notebook()
def draw_vectors(x, y, radius=10, alpha=0.25, color='blue',
                 width=600, height=400, show=True, **kwargs):
    """Draw an interactive bokeh scatter plot; extra kwargs become hover tooltips."""
    # A single color name is broadcast to one entry per point.
    if isinstance(color, str):
        color = [color] * len(x)
    source = bm.ColumnDataSource({'x': x, 'y': y, 'color': color, **kwargs})
    fig = pl.figure(active_scroll='wheel_zoom', width=width, height=height)
    fig.scatter('x', 'y', size=radius, color='color', alpha=alpha, source=source)
    tooltips = [(key, "@" + key) for key in kwargs.keys()]
    fig.add_tools(bm.HoverTool(tooltips=tooltips))
    if show:
        pl.show(fig)
    return fig
draw_vectors(word_vectors_umap[:, 0], word_vectors_umap[:, 1], token=words)
model_2.wv.most_similar('russia')
[('ukraine', 0.8960039019584656),
('canada', 0.8308505415916443),
('russian', 0.8201991319656372),
('electricite', 0.814449667930603),
('sweden', 0.8108468651771545),
('moscow', 0.8099092245101929),
('greece', 0.8071787357330322),
('norway', 0.8041681051254272),
('zealand', 0.8002776503562927),
('belgium', 0.7978895902633667)]
model_2.wv.most_similar(positive=['queen', 'boy'], negative=['girl'])
[('king', 0.8481787443161011),
('prince', 0.847251296043396),
('angel', 0.8389350175857544),
('lady', 0.8351561427116394),
('aka', 0.8251813650131226),
('punk', 0.8228825926780701),
('princess', 0.8196807503700256),
('rock', 0.7763223648071289),
('band', 0.7758490443229675),
('monster', 0.7747203707695007)]
from bertopic import BERTopic
/usr/local/lib/python3.8/dist-packages/packaging/version.py:127: DeprecationWarning: Creating a LegacyVersion has been deprecated and will be removed in the next major release warnings.warn(
emails_bert = list(df['Content'])
len(emails_bert)
50000
def tokenize(x):
    # Re-join WordPunct tokens with single spaces, normalizing whitespace and
    # separating punctuation from words.  Relies on the module-level
    # `nltk_tokenizer` (bound before this function is first called).
    return ' '.join(nltk_tokenizer.tokenize(x))
nltk_tokenizer = WordPunctTokenizer()
emails_bert = [tokenize(email) for email in emails_bert]
topic_model = BERTopic()
topics, _ = topic_model.fit_transform(emails_bert)
def show_topic(num):
    """Return the words of BERTopic topic *num* that carry a positive weight."""
    return [word for word, weight in topic_model.get_topic(num) if weight > 0]
show_topic(366)
['prick', 'sucks', 'terrible', 'yellowpages', 'dirty', 'bad', 'dislike']
show_topic(261)
['congrats', 'congratulations', 'exciting', 'promotions', 'jeff']
show_topic(327)
['superstar', 'choate', 'wonderful', 'success', 'gifts', 'huge', 'nice', 'informative', 'kimberly', 'glad']
show_topic(1164)
['drop', 'pick', 'damon', 'butch', 'dropping', 'picking', 'richie', 'rid', 'neal', 'kirby']
show_topic(394)
['vpn', 'pending', 'applications', 'requested', 'id', 'natgas', 'jeffrey', 'phillip', 'application', 'jeff']
topic_model.visualize_topics()